notebooks/Unit 9 - Model Optimization/tensorrt.ipynb

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# TensorRT\n", "\n", "In this notebook, we will use TensorRT to optimize a PyTorch model for inference. We will train a simple CNN model on the MNIST dataset, convert it to TensorRT engine using ONNX, and then perform inference using the optimized TensorRT engine model and evaluate the size and accuracy of the model. This notebook require a NVIDIA GPU with CUDA support or NVIDIA Jetson device." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup TensorRT\n", "\n", "First, install tensorrt and torch using pip and import the neccesary modules" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%pip install torch torchvision\n", "%pip install tensorrt==8.6.1\n", "%pip install pycuda onnx onnxruntime\n", "%pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com pytorch-quantization==2.1.2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "from torchvision import datasets, transforms\n", "import torch.quantization\n", "import pathlib\n", "import numpy as np\n", "import torch.onnx\n", "import tensorrt as trt\n", "import pycuda.driver as cuda\n", "import pycuda.autoinit\n", "import onnx\n", "import onnxruntime\n", "\n", "from pytorch_quantization import nn as quant_nn\n", "from pytorch_quantization import quant_modules\n", "from pytorch_quantization import calib\n", "from tqdm import tqdm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Train PyTorch Model and Export to ONNX\n", "\n", "Next, train a simple CNN model on the MNIST dataset and export it to ONNX format" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "transform=transforms.Compose([\n", " transforms.ToTensor(),\n", " transforms.Normalize((0.1307,), (0.3081,))\n", " ])\n", "\n", "train_dataset = datasets.MNIST('./data', train=True, download=True,transform=transform)\n", "test_dataset = datasets.MNIST('./data', train=False,transform=transform)\n", "\n", "class Net(nn.Module):\n", " def __init__(self):\n", " super(Net, self).__init__()\n", " self.conv1 = nn.Conv2d(in_channels=1, out_channels=12, kernel_size=3)\n", " self.pool = nn.MaxPool2d(kernel_size=2, stride=2)\n", " self.fc = nn.Linear(12 * 13 * 13, 10)\n", "\n", " def forward(self, x):\n", " x = x.view(-1, 1, 28, 28) \n", " x = F.relu(self.conv1(x))\n", " x = self.pool(x)\n", " x = x.view(x.size(0), -1) \n", " x = self.fc(x)\n", " output = F.log_softmax(x, dim=1)\n", " return output\n", "\n", "\n", "train_loader = torch.utils.data.DataLoader(train_dataset, 32)\n", "test_loader = torch.utils.data.DataLoader(test_dataset, 32)\n", "\n", "device = \"cpu\"\n", "\n", "epochs = 1\n", "\n", "model = Net().to(device)\n", "optimizer = optim.Adam(model.parameters())\n", "\n", "model.train()\n", "\n", "for epoch in range(1, epochs+1):\n", " for batch_idx, (data, target) in enumerate(train_loader):\n", " data, target = data.to(device), target.to(device)\n", " optimizer.zero_grad()\n", " output = model(data)\n", " loss = F.nll_loss(output, target)\n", " loss.backward()\n", " optimizer.step()\n", " print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'.format(\n", " epoch, batch_idx * len(data), len(train_loader.dataset),\n", " 100. * batch_idx / len(train_loader), loss.item()))\n", "\n", "MODEL_DIR = pathlib.Path(\"./models\")\n", "MODEL_DIR.mkdir(exist_ok=True)\n", "torch.save(model.state_dict(), MODEL_DIR / \"original_model.p\")\n", "\n", "x, _ = next(iter(train_loader))\n", "torch.onnx.export(model, \n", " x, \n", " MODEL_DIR / \"mnist_model.onnx\", \n", " export_params=True, \n", " opset_version=10, \n", " do_constant_folding=True, \n", " input_names = ['input'], \n", " output_names = ['output'], \n", " dynamic_axes={'input' : {0 : 'batch_size'}, \n", " 'output' : {0 : 'batch_size'}})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert ONNX Model to TensorRT\n", "\n", "To convert the ONNX model to TensorRT engine using the TensorRT Python API. First, initialize TensorRT components which are logger, builder, and network. Next, define ONNX parser to parse the ONNX model from the ONNX file into the TensorRT network. Then, create a builder configuration to set building parameters and a memory pool limit for the workspace in TensorRT. Then, create an optimization profile to handle dynamic input shapes with batch size of 32, channel size of 1, and image dimensions of 28x28. Next, built and serialized the TensorRT engine using the configured network and builder and then saved to disk. Finally, the script cleans up by deleting the builder and network objects to free up resources." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "onnx_path = MODEL_DIR / \"mnist_model.onnx\"\n", "trt_path = MODEL_DIR / 'mnist_engine_pytorch.trt'\n", "\n", "# initialize TensorRT engine and parse ONNX model\n", "logger = trt.Logger(trt.Logger.WARNING)\n", "builder = trt.Builder(logger)\n", "network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))\n", "\n", "parser = trt.OnnxParser(network, logger)\n", "parser.parse_from_file(str(onnx_path))\n", "\n", "# set up the builder config and coptimization profile\n", "config = builder.create_builder_config()\n", "config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30) \n", "\n", "profile = builder.create_optimization_profile()\n", "profile.set_shape(\"input\", (32, 1, 28, 28), (32, 1, 28, 28), (32, 1, 28, 28)) \n", "config.add_optimization_profile(profile)\n", "\n", "# serialize the engine, then save to disk\n", "serialized_engine = builder.build_serialized_network(network, config)\n", "with open(str(trt_path), 'wb') as f:\n", " f.write(serialized_engine)\n", "\n", "# free up resources\n", "del builder\n", "del network" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run Inference and Check Accuracy\n", "\n", "Finally, run inference and then compare the TensorRT engine model accuracy with the ONNX model on the test dataset.\n", "\n", "To run test ONNX model, load the model and test model model integrity and then loop over the given Data Loader, For each batch, convert the input data to a NumPy array and fed into the ONNX Runtime session. Once, obtained the output convert back to a PyTorch tensor. Then, calculate the accumulated negative log likelihood loss\n", "and number of correct predictions to measure the accuracy of the model.\n", "\n", "To test the tensorRT model, first, load the serialized engine from disk, and initialize TensorRT runtime. Then, deserialized the engine and create execution context is created. Next, allocate memory for input and output data on the GPU, set bindings for the TensorRT execution and create CUDA stream to manage asynchronous data transfers between the CPU and GPU. Then, Loop over the given Data Loader and for each batch, convert the input data to a NumPy array and transfer to the GPU, before executing the model asynchronously, and then transfer the predictions back to the CPU. Run synchronization to ensures proper coordination between threads. Next, reshaped output and convert to a PyTorch tensor to calculate the accumulated negative log likelihood loss and number of correct predictions to measure the accuracy of the model. Finally, free up the memory and CUDA resources " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def to_numpy(tensor):\n", " return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()\n", "\n", "def test_onnx(model_name, data_loader):\n", " onnx_model = onnx.load(model_name)\n", " onnx.checker.check_model(onnx_model)\n", " ort_session = onnxruntime.InferenceSession(model_name)\n", " test_loss = 0\n", " correct = 0\n", " for data, target in data_loader:\n", " ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(data)}\n", " output = ort_session.run(None, ort_inputs)[0]\n", " output = torch.from_numpy(output)\n", " if target.shape[0] == 32: # last batch might be smaller than 32 (quick fix)\n", " test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss\n", " pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability\n", " correct += pred.eq(target.view_as(pred)).sum().item()\n", " test_loss /= len(data_loader.dataset)\n", " return 100. * correct / len(data_loader.dataset)\n", "\n", "def test_tensorrt(model_name, data_loader):\n", " with open(model_name, \"rb\") as f:\n", " serialized_engine = f.read()\n", " runtime = trt.Runtime(logger)\n", " engine = runtime.deserialize_cuda_engine(serialized_engine)\n", " context = engine.create_execution_context()\n", " input_size = trt.volume(engine.get_binding_shape(0))\n", " output_size = trt.volume(engine.get_binding_shape(1))\n", " # Allocate device memory\n", " d_input = cuda.mem_alloc(input_size * 4) # Assuming 4-byte float32 data type\n", " d_output = cuda.mem_alloc(output_size * 4)\n", " bindings=[int(d_input), int(d_output)]\n", " stream = cuda.Stream()\n", " h_output = np.empty(output_size, dtype=np.float32)\n", " test_loss = 0\n", " correct = 0\n", " for data, target in data_loader:\n", " # Create numpy arrays to hold input and output data\n", " h_input = data.numpy().astype(np.float32)\n", " # Transfer input data to device\n", " cuda.memcpy_htod_async(d_input, h_input, stream)\n", " # Execute model\n", " context.execute_async_v2(bindings, stream.handle, None)\n", " # Transfer predictions back\n", " cuda.memcpy_dtoh_async(h_output, d_output, stream)\n", " # Syncronize threads\n", " stream.synchronize()\n", " output = h_output.reshape(context.get_tensor_shape('output'))\n", " output = torch.from_numpy(output)\n", " if target.shape[0] == 32: # last batch might be smaller than 32 (quick fix)\n", " test_loss += F.nll_loss(output, target, reduction='sum').item() \n", " pred = output.argmax(dim=1, keepdim=True) \n", " correct += pred.eq(target.view_as(pred)).sum().item()\n", " test_loss /= len(data_loader.dataset)\n", " del context\n", " del engine\n", " cuda.Context.pop()\n", " return 100. * correct / len(data_loader.dataset)\n", "\n", "acc = test_onnx(onnx_path, test_loader)\n", "print(f\"Accuracy of the onnx model is {acc}%\")\n", "\n", "trtr_acc = test_tensorrt(trt_path, test_loader)\n", "print(f\"Accuracy of the tensorrt model is {trtr_acc}%\")" ] } ], "metadata": { "kernelspec": { "display_name": "model_optimization", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 2 }

notebooks/Unit 9 - Model Optimization/tensorrt.ipynb (293 lines of code) (raw):